/*
	Estimates college graduate quality assuming that workers with only a single
	college listed but no degree received their bachelor's degree there.
	Output:	Estimates_z_c_imputeMissing.dta
			Estimates_q_j_imputeMissing.dta
*/
 
local seed "C:\Users\jsock\Dropbox\Research\GD\International"

local dataPath "`seed'/Data"
local inputPath "`seed'/InputData"
local figurePath "`seed'/Replication/Figures"
local tablePath "`seed'/Replication/Tables"
local estimatePath "`seed'/Replication/Estimates"
local tempPath "`seed'/Replication/TempData"

********************************************************
* Read in country gdp to get universitycountry gdp
********************************************************

preserve

	clear
	 
	insheet using "`inputPath'/Exchange_rates_2022.csv" , comma
	
	keep if year >= 2010 
	keep if year <= 2021 
	
	bys country_glassdoor: egen avg_gdppw = mean(gdppw)
	generate log_gdppw_uc = ln(avg_gdppw)
	
	bys country_glassdoor: keep if _n == 1

	keep country_glassdoor iso log_gdppw_uc	
	keep if log_gdppw_uc != .
	sort country_glassdoor
	
	save "`tempPath'/Country_gdppw.dta" , replace
	
restore 

********************************************************
* Read in country gdp to get universitycountry gdp
********************************************************

clear 
set more off
set matsize 10000
set scheme s2mono

* Set path and load data 
insheet using "`dataPath'/Salaries_international_dataset.csv", comma
drop v1

drop if jobtitle == ""

drop metro shortname  
drop city basecurrency country_iso  
drop gender birthyear
drop sectorname iscurrentjobflag 

*--------------------------------
* Thresholds used for sample selection
*--------------------------------
.
scalar country_premia_thresh = 25 

scalar selection_thresh = 25

scalar school_thresh = 25

*--------------------------------
* Exclude users that leave more than 10 reviews
*--------------------------------

sort fk_userid yearofsalary dateval salid

by fk_userid : gen obsNum = _n

by fk_userid : gen userReviews = _N

drop if userReviews > 10

*--------------------------------
* Generate additional variables
*--------------------------------

generate exp = yearsofrelevantexpnumber
generate exp_sqrd = exp ^ 2
drop yearsofrelevantexpnumber

generate logbase = ln(basesalary * ppp_xrat)

generate log_gdppw = ln(gdppw)

drop if ppp_xrat == .

*--------------------------------
* Exclude outliers in base pay
*--------------------------------

scalar scalingThresh = 10

generate realbase = basesalary * ppp_xrat
generate outside_thresh =  (realbase < (1/scalingThresh) * gdppw) | (realbase > scalingThresh * gdppw) 
drop realbase

*-------------------------------- 
* Drop Singapore Polytechnics which aren't considered by WHED
*--------------------------------

drop if school == "Nanyang Polytechnic"
drop if school == "Ngee Ann Polytechnic"
drop if school == "Republic Polytechnic"
drop if school == "Singapore Polytechnic"
drop if school == "Temasek Polytechnic"

*--------------------------------
* Generate indicators for wage in same country as university or not
*--------------------------------

generate home_country = countryname == universitycountry & universitycountry != ""

generate foreign_country = countryname != universitycountry & universitycountry != ""

*--------------------------------
* Generate indicators for valid in education analysis
*--------------------------------

* For first degree

generate uniDegree = degree == "BACHELORS" | (degree == "missing" & school_2 == "")

generate hasSchool = school != ""

* For second degree

generate hasDegree_2 = degree_2 != "UNMATCHED" & degree_2 != "missing" & degree_2 != ""  & degree != "HIGHSCHOOL"

generate uniDegree_2 = degree_2 != "ASSOCIATES" & degree_2 != "DIPLOMA"  & degree_2 != "HIGHSCHOOL" & degree_2 != "BACHELORS"

generate hasSchool_2 = school_2 != ""

replace degree_2 = "NONE" if degree_2 == ""

* Calculate share with usable info
generate usableResume = hasSchool & universitycountry != "" & degree != "missing" 
sum usableResume, detail
scalar share_usable_resume = r(mean)

*--------------------------------
* Add work country name
*--------------------------------

generate country_glassdoor = countryname

merge m:1 country_glassdoor using "`tempPath'/Country_gdppw.dta"
drop _merge

rename iso work_country_iso
rename log_gdppw_uc log_gdppw_work_country 

drop country_glassdoor 

*--------------------------------
* Add country of study name
*--------------------------------

generate country_glassdoor = universitycountry

merge m:1 country_glassdoor using "`tempPath'/Country_gdppw.dta"
drop _merge

rename iso university_country_iso
rename log_gdppw_uc log_gdppw_university_country 

drop country_glassdoor 

*--------------------------------
* Drop unused obs
*--------------------------------

drop if employertypecode == "SELF_EMPLOYED" 

drop if outside_thresh

generate valid_educ = uniDegree & hasSchool & universitycountry != ""
keep if valid_educ

********************************************************
* SAMPLE SIZE FOR COUNTRY PREMIA Z_C (MOVERS)
******************************************************** 

*--------------------------------
* Determine top destinations
*--------------------------------

sort fk_userid yearofsalary dateval salid

by fk_userid: generate destination = countryname[_n+1]

generate migrant = destination != countryname & destination != ""

sort destination 
by destination : egen destinationMigrants = sum(migrant)
by destination : replace destinationMigrants = . if destination == ""
by destination : replace destinationMigrants = . if _n > 1

replace destinationMigrants = destinationMigrants * -1

sort destinationMigrants 

* Determine top destination (1.0& of migrants)

egen totalMigrants = sum(destinationMigrants * -1)
	
generate migrant_share = -1 * destinationMigrants / totalMigrants
	
generate top_destination = migrant_share >= 0.01 & migrant_share != .

bys destination : egen topDestination = max(top_destination)

*--------------------------------
* Determine # of migrants to top ten destination for each origin
*--------------------------------

bys countryname : egen migrantsOverall = sum(migrant)  

bys countryname : egen migrantsTop = sum(migrant * topDestination)  

generate valid_country_premia = migrantsTop >= country_premia_thresh & migrantsTop != .

********************************************************
* ESTIMATE SKILL LOSS
********************************************************

sort fk_userid yearofsalary dateval salid
 
by fk_userid: gen prev_country = countryname[_n-1]

*--------------------------------
* Determine if migrated
*--------------------------------

generate migrated = countryname != prev_country & prev_country != ""

sort fk_userid yearofsalary dateval salid
by fk_userid: gen already_migrated = sum(migrated)

*--------------------------------
* Generate variable capturing delta_gdppw for migrants - between countries of work
*--------------------------------

sort fk_userid yearofsalary dateval salid
by fk_userid: gen prev_gdp = log_gdppw_work_country[_n-1]
generate delta_log_gdppw = abs(log_gdppw_work_country - prev_gdp)

generate migrant_delta_gdppw_work = 0
by fk_userid: replace migrant_delta_gdppw_work = delta_log_gdppw if migrated == 1 
by fk_userid: replace migrant_delta_gdppw_work = migrant_delta_gdppw_work[_n-1] if migrated == 0 & migrated[_n-1] == 1
by fk_userid: replace migrant_delta_gdppw_work = migrant_delta_gdppw_work[_n-2] if migrated == 0 & migrated[_n-1] == 0 & migrated[_n-2] == 1
by fk_userid: replace migrant_delta_gdppw_work = migrant_delta_gdppw_work[_n-3] if migrated == 0 & migrated[_n-1] == 0 & migrated[_n-2] == 0 & migrated[_n-3] == 1
by fk_userid: replace migrant_delta_gdppw_work = migrant_delta_gdppw_work[_n-4] if migrated == 0 & migrated[_n-1] == 0 & migrated[_n-2] == 0 & migrated[_n-3] == 0 & migrated[_n-4] == 1
by fk_userid: replace migrant_delta_gdppw_work = migrant_delta_gdppw_work[_n-5] if migrated == 0 & migrated[_n-1] == 0 & migrated[_n-2] == 0 & migrated[_n-3] == 0 & migrated[_n-4] == 0 & migrated[_n-5] == 1
by fk_userid: replace migrant_delta_gdppw_work = migrant_delta_gdppw_work[_n-6] if migrated == 0 & migrated[_n-1] == 0 & migrated[_n-2] == 0 & migrated[_n-3] == 0 & migrated[_n-4] == 0 & migrated[_n-5] == 0 & migrated[_n-6] == 1
by fk_userid: replace migrant_delta_gdppw_work = migrant_delta_gdppw_work[_n-7] if migrated == 0 & migrated[_n-1] == 0 & migrated[_n-2] == 0 & migrated[_n-3] == 0 & migrated[_n-4] == 0 & migrated[_n-5] == 0 & migrated[_n-6] == 0 & migrated[_n-7] == 1
by fk_userid: replace migrant_delta_gdppw_work = migrant_delta_gdppw_work[_n-8] if migrated == 0 & migrated[_n-1] == 0 & migrated[_n-2] == 0 & migrated[_n-3] == 0 & migrated[_n-4] == 0 & migrated[_n-5] == 0 & migrated[_n-6] == 0 & migrated[_n-7] == 0 & migrated[_n-8] == 1

*--------------------------------
* Generate coarse desination FE for migration 
*	Keep top destinations alone, group others by continent
*--------------------------------

	* Top destinations
	
	preserve

		keep destination topDestination
		rename destination countryname
		rename topDestination is_topDestination
		
		bys countryname : keep if _n == 1
		
		tempfile in_results
		save `in_results', emptyok
		
	restore

	merge m:1 countryname using `in_results'
	drop if _merge == 2
	drop _merge

	generate coarseDestination = ""
	replace coarseDestination = countryname if is_topDestination == 1
	drop is_topDestination

	* Continents
	
	preserve

		clear
		
		insheet using "`inputPath'\Country_continents.csv" , comma
		
		rename v2 countryname
		rename v7 continent
		keep countryname continent
		
		bys countryname : keep if _n == 1
		
		replace countryname = "Czech Republic" if countryname == "Czechia"
		replace countryname = "Iran" if countryname == "Iran (Islamic Republic of)"
		replace countryname = "Russia" if countryname == "Russian Federation"
		replace countryname = "Hong Kong" if countryname == "China, Hong Kong Special Administrative Region"
		replace countryname = "South Korea" if countryname == "Republic of Korea"
		replace countryname = "United Kingdom" if countryname == "United Kingdom of Great Britain and Northern Ireland"
		replace countryname = "United States" if countryname == "United States of America"
		replace countryname = "Vietnam" if countryname == "Viet Nam"
		
		tempfile in_results
		save `in_results', emptyok
		
	restore

	merge m:1 countryname using `in_results'
	drop if _merge == 2
	drop _merge

	replace coarseDestination = continent if coarseDestination == "" & valid_country_premia
	
	* Destination FE
	
	tab coarseDestination if valid_country_premia == 1, gen(c_)
	foreach my_var of varlist c_*{
		replace `my_var' = 0 if ~(migrated | already_migrated >= 1)
	}	
	
	
********************************************************
* STEP 1: ESTIMATE COUNTRY FE Z_C
********************************************************

local ending "imputeMissing"

reghdfe logbase exp exp_sqrd c_* migrant_delta_gdppw_work if valid_country_premia , absorb(fk_userid fe_country=countryname yearofsalary ) vce(cluster countryname)

bys countryname : egen obs_`ending' =sum(e(sample))

bys countryname : egen z_c_`ending' = max(fe_country)

generate hasCountryPremia = z_c_`ending' != .

*--------------------------------
* GENERATE tau_`ending' FOR WORKERS
*--------------------------------

generate tau_`ending' = 0
foreach my_var of varlist c_* {
	
	bys coarseDestination : egen fill_`my_var' = max(`my_var')
	replace tau_`ending' = tau_`ending' + `my_var' * _b["`my_var'"] 
	replace tau_`ending' = tau_`ending' + fill_`my_var' * _b["`my_var'"] if `my_var' == 0 & universitycountry != countryname 
	drop fill_*
	
}
foreach my_var of varlist migrant_delta_gdppw_work {

	replace tau_`ending' = tau_`ending' + `my_var' * _b["`my_var'"]
	replace tau_`ending' = tau_`ending' + abs(log_gdppw_work_country - log_gdppw_university_country) * _b["`my_var'"] if `my_var' == 0 & universitycountry != countryname 

}

generate logbase_z_c_`ending' = logbase - z_c_`ending' - tau_`ending'

drop fe_country

preserve

	keep if z_c != .
	
	keep countryname z_c_`ending'
	
	bys countryname : keep if _n == 1
	
	generate sort_z_c = -1 * z_c
	
	sort sort_z_c
	
	drop sort_z_c

	save "`estimatePath'\Estimates_z_c_`ending'.dta" , replace

restore

********************************************************
* STEP 1B: KEEP ONLY EDUCATION SAMPLE NOW THAT MOVERS HAVE BEEN USED
********************************************************

keep if valid_educ & hasCountryPremia & school != ""

********************************************************
* Determine step 2 sample thresholds 
********************************************************

bys school: egen schoolObs = sum(hasCountryPremia & valid_educ)

bys school: egen schoolObsLocal = sum(hasCountryPremia & valid_educ & home_country)

generate valid_school = schoolObsLocal >= school_thresh 

********************************************************
* ESTIMATE SCHOOL FE 
********************************************************

reghdfe logbase_z_c_`ending' exp exp_sqrd if valid_educ & valid_school & home_country & universitycountry != "", absorb(fe_school=school yearofsalary)

bys school : egen obs_school_`ending' =sum(e(sample))

bys school : egen q_j_`ending' = max(fe_school)

drop fe_school

preserve

	keep if q_j_`my_ending' != .
	
	keep school q_j_`my_ending'
	
	order school q_j_`my_ending'
	
	bys school : keep if _n == 1
	
	generate sort_qj = -1 * q_j	
	sort sort_qj
	
	drop sort_qj

	save "`estimatePath'\Estimates_q_j_`ending'.dta" , replace

restore
